import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium
# Define file paths
population_file_path = 'C:/Users/Serhiy/Documents/Python Scripts/kyiv-store-population-analysis/data/processed/Population_with_Nearest_Store.csv'
# Load the data
population_df = pd.read_csv(population_file_path)
# Display the first few rows of each DataFrame
print("Population DataFrame:")
print(population_df.head())
Population DataFrame:
Latitude Longitude Metric population Building Id Nearest Store \
0 50.435976 30.625646 1.96 1 Store 84
1 50.435185 30.626106 1.95 2 Store 84
2 50.436381 30.626037 1.70 3 Store 84
3 50.436153 30.626818 2.09 4 Store 84
4 50.435430 30.626640 2.13 5 Store 84
Distance to Nearest Store (km)
0 1.372
1 1.286
2 1.385
3 1.329
4 1.279
Objective: To identify potential new store locations by analyzing the population that is currently underserved by existing stores.
Reasoning: In this section, we will filter the population data to identify buildings that are more than 1 kilometer away from the nearest store. This will help us focus our analysis on the underserved population, ensuring that new store locations are chosen in areas with the highest need. By setting a 1-kilometer threshold, we aim to minimize the cannibalization effect and allocate resources efficiently.
# Assuming population_df is already loaded and filtered for distance > 1 km
filtered_population_df = population_df[population_df['Distance to Nearest Store (km)'] > 1]
# Create a map centered around Kyiv
kyiv_map = folium.Map(location=[50.4501, 30.5234], zoom_start=12)
# Add building points to the map with circle size based on Metric population
for idx, row in filtered_population_df.iterrows():
folium.CircleMarker(
location=[row['Latitude'], row['Longitude']],
radius=row['Metric population'] * 2, # Adjust the multiplier as necessary to scale circle sizes
color=None, # No border color
weight=0, # No border weight
fill=True,
fill_color='green',
fill_opacity=0.6 # Set fill_opacity to 0.6 for 60% opacity
).add_to(kyiv_map)
# Save the map to the specified folder
map_output_path = 'C:/Users/Serhiy/Documents/Python Scripts/kyiv-store-population-analysis/results/kyiv_population_map_1_km_away_from_stores.html'
kyiv_map.save(map_output_path)
# Display the map
kyiv_map
To suggest the best locations for opening a new store, we can use clustering. We will use the K-Means clustering algorithm to group the buildings that are more than 1 kilometer away from the nearest store. K-Means clustering is a popular method for partitioning data into distinct groups, which can help identify optimal locations for new stores.
Distance Constraints: The K-Means algorithm may not always perfectly meet the 1-kilometer distance constraint within clusters. If strict adherence to the distance constraint is required, other clustering algorithms like DBSCAN or hierarchical clustering can be considered.
Choosing the Number of Clusters: Determining the optimal number of clusters is crucial. We will use 55 clusters based on the following analysis:
# Extract coordinates for clustering
coordinates = filtered_population_df[['Latitude', 'Longitude']].values
# Apply K-Means clustering
num_clusters = 55
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(coordinates)
# Add cluster labels to the DataFrame using .loc to avoid SettingWithCopyWarning
filtered_population_df.loc[:, 'Cluster'] = kmeans.labels_
# Get the cluster centers (new store locations)
new_store_locations = kmeans.cluster_centers_
# Print the centers of the clusters
print("Cluster Centers:")
print(new_store_locations)
Cluster Centers: [[50.43863117 30.54335424] [50.44476435 30.4248793 ] [50.47251979 30.62892821] [50.42192884 30.49191966] [50.39769671 30.38439242] [50.35259583 30.44424854] [50.48261199 30.39492072] [50.42113143 30.6930512 ] [50.43781388 30.64476495] [50.47997611 30.45652395] [50.46181421 30.36316156] [50.52781064 30.6250974 ] [50.39260039 30.49844933] [50.39807891 30.6010238 ] [50.43494112 30.61772154] [50.45383602 30.52408836] [50.40286465 30.51134239] [50.46306135 30.32578207] [50.3651799 30.55514492] [50.36491735 30.46080878] [50.39360103 30.4699805 ] [50.44873075 30.67329464] [50.4647108 30.42066778] [50.42037999 30.47263453] [50.50479779 30.58755881] [50.42527427 30.42941432] [50.43204665 30.32683221] [50.48844037 30.47833661] [50.4971983 30.43003724] [50.44700718 30.61534569] [50.38511708 30.69688223] [50.34930413 30.42405451] [50.43304154 30.50839898] [50.49087551 30.33309166] [50.34888522 30.47903213] [50.43672739 30.36536996] [50.39451368 30.65028167] [50.39880619 30.53052171] [50.47231677 30.59777972] [50.37922795 30.54216839] [50.47685864 30.67179204] [50.42616807 30.66753667] [50.38922953 30.62641383] [50.5291513 30.36486144] [50.41548613 30.38966591] [50.43850988 30.62793544] [50.45051277 30.51237422] [50.38653045 30.48388426] [50.43660428 30.4723366 ] [50.46404503 30.4386744 ] [50.43631129 30.41410166] [50.47570333 30.44218488] [50.40747267 30.36174217] [50.48891973 30.41072389] [50.42445905 30.55166319]]
C:\Users\Serhiy\AppData\Local\Temp\ipykernel_22684\1375290614.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_population_df.loc[:, 'Cluster'] = kmeans.labels_
# Assign buildings to the nearest new store based on clustering results
filtered_population_df = filtered_population_df.copy() # Avoid SettingWithCopyWarning by creating a copy
filtered_population_df.loc[:, 'Nearest New Store Id'] = filtered_population_df['Cluster'] + 1 # Cluster starts from 0
# Output the updated DataFrame for verification
filtered_population_df.head()
| Latitude | Longitude | Metric population | Building Id | Nearest Store | Distance to Nearest Store (km) | Cluster | Nearest New Store Id | |
|---|---|---|---|---|---|---|---|---|
| 0 | 50.435976 | 30.625646 | 1.96 | 1 | Store 84 | 1.372 | 45 | 46 |
| 1 | 50.435185 | 30.626106 | 1.95 | 2 | Store 84 | 1.286 | 45 | 46 |
| 2 | 50.436381 | 30.626037 | 1.70 | 3 | Store 84 | 1.385 | 45 | 46 |
| 3 | 50.436153 | 30.626818 | 2.09 | 4 | Store 84 | 1.329 | 45 | 46 |
| 4 | 50.435430 | 30.626640 | 2.13 | 5 | Store 84 | 1.279 | 45 | 46 |
# Create a DataFrame for potential new stores from the cluster centers
new_stores_df = pd.DataFrame(new_store_locations, columns=['Latitude', 'Longitude'])
new_stores_df['New Store Id'] = range(1, len(new_stores_df) + 1)
# Display the first few rows
print("New Stores DataFrame:")
print(new_stores_df.head())
New Stores DataFrame:
Latitude Longitude New Store Id
0 50.438631 30.543354 1
1 50.444764 30.424879 2
2 50.472520 30.628928 3
3 50.421929 30.491920 4
4 50.397697 30.384392 5
# Extract coordinates
filtered_population_coords = filtered_population_df[['Latitude', 'Longitude']].values
new_stores_coords = new_stores_df[['Latitude', 'Longitude']].values
# Calculate the Haversine distance between each potential new store and buildings from filtered_population_df
def haversine(lat1, lon1, lat2, lon2):
R = 6371.0 # Earth radius in kilometers
dlat = np.radians(lat2 - lat1)
dlon = np.radians(lon2 - lon1)
a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
return R * c
# Calculate the distances using the Haversine formula
distances_list = []
for i, pop_coord in enumerate(filtered_population_coords):
for j, store_coord in enumerate(new_stores_coords):
dist = haversine(pop_coord[0], pop_coord[1], store_coord[0], store_coord[1])
distances_list.append({
'Building Id': int(filtered_population_df.iloc[i]['Building Id']),
'New Store Id': int(new_stores_df.iloc[j]['New Store Id']),
'Distance (km)': round(dist, 3)}
)
# Convert distances list to a DataFrame
distances_new_stores_df = pd.DataFrame(distances_list)
# Save the distances to the processed folder
distances_output_path = 'C:/Users/Serhiy/Documents/Python Scripts/kyiv-store-population-analysis/data/processed/distances_new_stores.csv'
distances_new_stores_df.to_csv(distances_output_path, index=False)
# Print the distances_new_stores_df for verification
print("\nDistances New Stores DataFrame:")
print(distances_new_stores_df)
Distances New Stores DataFrame:
Building Id New Store Id Distance (km)
0 1 1 5.836
1 1 2 14.251
2 1 3 4.070
3 1 4 9.600
4 1 5 17.616
... ... ... ...
247880 12748 51 4.928
247881 12748 52 0.403
247882 12748 53 9.678
247883 12748 54 3.041
247884 12748 55 9.261
[247885 rows x 3 columns]
# Drop columns if they exist to ensure recalculation
if 'Distance to Nearest New Store (km)' in filtered_population_df.columns:
filtered_population_df.drop(columns=['Distance to Nearest New Store (km)'], inplace=True)
# Ensure distances_new_stores_df contains the right columns
print(distances_new_stores_df.columns)
# Calculate the minimum distance to the nearest new store for each building
min_distances = distances_new_stores_df.groupby('Building Id')['Distance (km)'].min().reset_index()
min_distances.rename(columns={'Distance (km)': 'Distance to Nearest New Store (km)'}, inplace=True)
# Merge the minimum distances back to the filtered_population_df
filtered_population_df = filtered_population_df.merge(min_distances, on='Building Id', how='left')
# Output the updated DataFrame for verification
print("\nFiltered Population DataFrame with Minimum Distances:")
print(filtered_population_df.head())
Index(['Building Id', 'New Store Id', 'Distance (km)'], dtype='object')
Filtered Population DataFrame with Minimum Distances:
Latitude Longitude Metric population Building Id Nearest Store \
0 50.435976 30.625646 1.96 1 Store 84
1 50.435185 30.626106 1.95 2 Store 84
2 50.436381 30.626037 1.70 3 Store 84
3 50.436153 30.626818 2.09 4 Store 84
4 50.435430 30.626640 2.13 5 Store 84
Distance to Nearest Store (km) Cluster Nearest New Store Id \
0 1.372 45 46
1 1.286 45 46
2 1.385 45 46
3 1.329 45 46
4 1.279 45 46
Distance to Nearest New Store (km)
0 0.325
1 0.392
2 0.272
3 0.274
4 0.355
# Drop columns if they exist to ensure recalculation
columns_to_drop = ['Population Served', 'Population Served within 1 km', 'Population Served within 500 meters',
'Rank by Population Served within 1 km', 'Rank by Population Served']
new_stores_df.drop(columns=[col for col in columns_to_drop if col in new_stores_df.columns], inplace=True)
# Calculate the Population served by each new store without any distance limits
population_served = filtered_population_df.groupby('Nearest New Store Id')['Metric population'].sum().reset_index()
population_served.rename(columns={'Metric population': 'Population Served', 'Nearest New Store Id': 'New Store Id'}, inplace=True)
# Calculate the Population served within 1 km by each new store
population_served_within_1km = filtered_population_df[filtered_population_df['Distance to Nearest New Store (km)'] <= 1]
population_served_within_1km = population_served_within_1km.groupby('Nearest New Store Id')['Metric population'].sum().reset_index()
population_served_within_1km.rename(columns={'Metric population': 'Population Served within 1 km', 'Nearest New Store Id': 'New Store Id'}, inplace=True)
# Calculate the Population served within 500 meters by each new store
population_served_within_500m = filtered_population_df[filtered_population_df['Distance to Nearest New Store (km)'] <= 0.5]
population_served_within_500m = population_served_within_500m.groupby('Nearest New Store Id')['Metric population'].sum().reset_index()
population_served_within_500m.rename(columns={'Metric population': 'Population Served within 500 meters', 'Nearest New Store Id': 'New Store Id'}, inplace=True)
# Merge the calculated columns with new_stores_df
new_stores_df = new_stores_df.merge(population_served, on='New Store Id', how='left')
new_stores_df = new_stores_df.merge(population_served_within_1km, on='New Store Id', how='left')
new_stores_df = new_stores_df.merge(population_served_within_500m, on='New Store Id', how='left')
# Rank each of the new stores by the population served within 1 km and overall population served
new_stores_df['Rank by Population Served within 1 km'] = new_stores_df['Population Served within 1 km'].rank(ascending=False)
new_stores_df['Rank by Population Served'] = new_stores_df['Population Served'].rank(ascending=False)
# Print the new_stores_df for verification
print("New Stores DataFrame with Population Served and Rankings:")
print(new_stores_df)
# Save the updated new_stores_df to the processed folder
new_stores_output_path = 'C:/Users/Serhiy/Documents/Python Scripts/kyiv-store-population-analysis/data/processed/new_stores_population_served.csv'
new_stores_df.to_csv(new_stores_output_path, index=False)
New Stores DataFrame with Population Served and Rankings:
Latitude Longitude New Store Id Cluster Population Served \
0 50.438631 30.543354 1 0 389.92
1 50.444764 30.424879 2 1 217.30
2 50.472520 30.628928 3 2 355.73
3 50.421929 30.491920 4 3 158.31
4 50.397697 30.384392 5 4 366.97
5 50.352596 30.444249 6 5 81.98
6 50.482612 30.394921 7 6 113.31
7 50.421131 30.693051 8 7 119.11
8 50.437814 30.644765 9 8 128.91
9 50.479976 30.456524 10 9 154.58
10 50.461814 30.363162 11 10 258.46
11 50.527811 30.625097 12 11 93.80
12 50.392600 30.498449 13 12 229.14
13 50.398079 30.601024 14 13 67.57
14 50.434941 30.617722 15 14 440.27
15 50.453836 30.524088 16 15 172.60
16 50.402865 30.511342 17 16 253.57
17 50.463061 30.325782 18 17 17.87
18 50.365180 30.555145 19 18 56.20
19 50.364917 30.460809 20 19 199.12
20 50.393601 30.469981 21 20 157.08
21 50.448731 30.673295 22 21 147.15
22 50.464711 30.420668 23 22 147.12
23 50.420380 30.472635 24 23 323.66
24 50.504798 30.587559 25 24 100.68
25 50.425274 30.429414 26 25 315.22
26 50.432047 30.326832 27 26 78.39
27 50.488440 30.478337 28 27 142.42
28 50.497198 30.430037 29 28 115.75
29 50.447007 30.615346 30 29 268.30
30 50.385117 30.696882 31 30 106.58
31 50.349304 30.424055 32 31 80.51
32 50.433042 30.508399 33 32 269.42
33 50.490876 30.333092 34 33 106.06
34 50.348885 30.479032 35 34 127.24
35 50.436727 30.365370 36 35 183.73
36 50.394514 30.650282 37 36 29.51
37 50.398806 30.530522 38 37 180.94
38 50.472317 30.597780 39 38 45.93
39 50.379228 30.542168 40 39 89.29
40 50.476859 30.671792 41 40 49.22
41 50.426168 30.667537 42 41 166.77
42 50.389230 30.626414 43 42 76.85
43 50.529151 30.364861 44 43 13.76
44 50.415486 30.389666 45 44 49.22
45 50.438510 30.627935 46 45 223.86
46 50.450513 30.512374 47 46 164.36
47 50.386530 30.483884 48 47 224.23
48 50.436604 30.472337 49 48 204.19
49 50.464045 30.438674 50 49 212.69
50 50.436311 30.414102 51 50 337.71
51 50.475703 30.442185 52 51 308.13
52 50.407473 30.361742 53 52 22.83
53 50.488920 30.410724 54 53 163.73
54 50.424459 30.551663 55 54 156.09
Population Served within 1 km Population Served within 500 meters \
0 374.92 182.72
1 208.89 125.79
2 315.53 139.84
3 151.18 97.92
4 349.29 187.21
5 55.05 28.88
6 111.11 100.43
7 92.98 17.96
8 122.67 101.15
9 135.95 89.91
10 226.64 159.72
11 93.80 83.63
12 227.14 176.71
13 53.45 34.83
14 434.89 312.15
15 151.06 137.76
16 245.94 134.98
17 15.57 15.57
18 53.93 42.99
19 196.94 158.65
20 150.70 83.29
21 140.26 80.30
22 144.74 54.99
23 280.80 116.97
24 100.68 81.61
25 267.44 112.83
26 68.45 28.00
27 111.66 28.71
28 96.86 68.60
29 264.40 138.77
30 76.41 45.83
31 70.77 59.42
32 267.00 93.44
33 101.66 52.70
34 116.77 69.87
35 115.62 61.43
36 26.25 20.78
37 168.19 83.09
38 43.02 38.61
39 80.79 35.29
40 49.22 45.78
41 166.77 132.09
42 76.85 76.85
43 13.76 13.76
44 49.22 22.71
45 223.86 178.41
46 164.36 164.36
47 224.23 138.65
48 202.45 125.98
49 207.92 120.13
50 310.70 99.45
51 303.28 208.78
52 22.83 14.09
53 135.49 47.61
54 153.69 55.67
Rank by Population Served within 1 km Rank by Population Served
0 2.0 2.0
1 16.0 16.0
2 4.0 4.0
3 24.0 26.0
4 3.0 3.0
5 46.0 43.0
6 35.0 37.0
7 40.0 35.0
8 31.0 33.0
9 29.0 29.0
10 13.0 11.0
11 39.0 41.0
12 12.0 13.0
13 48.0 47.0
14 1.0 1.0
15 25.0 22.0
16 11.0 12.0
17 54.0 54.0
18 47.0 48.0
19 19.0 19.0
20 26.0 27.0
21 28.0 30.0
22 27.0 31.0
23 7.0 6.0
24 37.0 40.0
25 8.0 7.0
26 45.0 45.0
27 34.0 32.0
28 38.0 36.0
29 10.0 10.0
30 43.0 38.0
31 44.0 44.0
32 9.0 9.0
33 36.0 39.0
34 32.0 34.0
35 33.0 20.0
36 52.0 52.0
37 20.0 21.0
38 51.0 51.0
39 41.0 42.0
40 49.5 49.5
41 21.0 23.0
42 42.0 46.0
43 55.0 55.0
44 49.5 49.5
45 15.0 15.0
46 22.0 24.0
47 14.0 14.0
48 18.0 18.0
49 17.0 17.0
50 5.0 5.0
51 6.0 8.0
52 53.0 53.0
53 30.0 25.0
54 23.0 28.0
Overview The cluster map visualizes potential new store locations based on population distribution in Kyiv, helping stakeholders identify optimal store sites to maximize population served.
Key Features
1. Cluster Visualization:
2. New Store Locations:
Usage Instructions
Note. Cluster Centers: K-Means algorithm centers may be influenced by distant buildings.
# Create a new map for cluster visualization
cluster_map = folium.Map(location=[50.4501, 30.5234], zoom_start=12)
# Define colors for clusters
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'darkblue', 'darkgreen', 'darkpurple', 'cadetblue', 'black']
# Add building points with cluster colors
for idx, row in filtered_population_df.iterrows():
color = colors[row['Cluster'] % len(colors)]
folium.CircleMarker(
location=[row['Latitude'], row['Longitude']],
radius=row['Metric population'] * 2,
color=None,
weight=0,
fill=True,
fill_color=color,
fill_opacity=0.6,
tooltip=(f"Metric population: {row['Metric population']}<br>"
f"Nearest New Store Id: {row['Nearest New Store Id']}<br>"
f"Distance to Nearest New Store (km): {row['Distance to Nearest New Store (km)']:.2f}")
).add_to(cluster_map)
# Add new store locations to the map with additional information
for idx, row in new_stores_df.iterrows():
popup_content = (f"<b>New Store Id {int(row['New Store Id'])}</b><br>"
f"Population Served: {row['Population Served']}<br>"
f"Rank by Population Served: {row['Rank by Population Served']}<br>"
f"Population Served within 1 km: {row['Population Served within 1 km']}<br>"
f"Rank by Population Served within 1 km: {row['Rank by Population Served within 1 km']}")
folium.Marker(
location=[row['Latitude'], row['Longitude']],
popup=folium.Popup(popup_content, max_width=250),
icon=folium.DivIcon(
html=f"""
<div style="position: relative; width: 20px; height: 30px; margin-top: -30px;">
<div style="background-color: darkblue; color: white; text-align: center; border-radius: 10px 10px 0 0; width: 20px; height: 20px; line-height: 20px; box-shadow: 0 0 3px rgba(0,0,0,0.3); position: absolute; top: 0; left: 50%; transform: translateX(-50%);">
{int(row['Rank by Population Served'])}
</div>
<div style="position: absolute; bottom: 0; left: 50%; transform: translateX(-50%); width: 0; height: 0; border-left: 10px solid transparent; border-right: 10px solid transparent; border-top: 10px solid darkblue;"></div>
</div>
"""
)
).add_to(cluster_map)
# Save the cluster map to the specified folder
cluster_map_output_path = 'C:/Users/Serhiy/Documents/Python Scripts/kyiv-store-population-analysis/results/kyiv_new_store_locations_map.html'
cluster_map.save(cluster_map_output_path)
# Display the cluster map
cluster_map